package cn.cake.reptile.entity;

import cn.cake.reptile.util.CharsetDetector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;

/**
 * A fetched page: the raw response bytes together with lazily derived views of them
 * (the decoded HTML text and the parsed jsoup {@link Document}).
 *
 * <p>Both derived views are computed on first use and then cached in the instance.
 *
 * @author mengll
 * @date 2019/1/4 14:49
 */
public class Page {

    /** Decoded text of {@link #content}; cached by {@link #html()} once computed. */
    private String html;

    /** Raw bytes of the page body; {@code null} when nothing is available to decode. */
    private byte[] content;

    /** Parsed form of {@link #html}; cached by {@link #doc()} once computed. */
    private Document document;

    /** Name of the charset used to decode {@link #content}; guessed from the bytes when {@code null}. */
    private String charset;

    /** Crawl descriptor whose URL is used as the base URI when parsing. */
    private CrawlStruct crawlStruct;

    /**
     * Returns the page parsed as a jsoup document, parsing it on the first call and
     * returning the cached instance afterwards.
     *
     * @return the parsed document, or {@code null} when the page has no content to parse
     */
    public Document doc() {
        if (document != null) {
            return document;
        }
        String markup = html();
        if (markup == null) {
            // Nothing to parse; handing null to the parser would only fail inside jsoup.
            return null;
        }
        String baseUri = crawlStruct == null ? null : crawlStruct.getUrl();
        // An empty base URI means relative links are simply left unresolved.
        this.document = Jsoup.parse(markup, baseUri == null ? "" : baseUri);
        return document;
    }

    /**
     * Returns the page body decoded to text, decoding it on the first call and
     * returning the cached string afterwards.
     *
     * <p>The charset is taken from {@link #charset}; when that is unset it is obtained
     * from {@code CharsetDetector.guessEncoding(content)}. If no charset name is
     * available, or the JVM does not support the named charset, UTF-8 is used instead
     * so that a bad charset label does not make the whole page unreadable.
     *
     * @return the decoded HTML, or {@code null} when {@link #content} is {@code null}
     */
    private String html() {
        if (html != null) {
            return html;
        }
        if (content == null) {
            return null;
        }
        if (charset == null) {
            charset = CharsetDetector.guessEncoding(content);
        }
        if (charset == null) {
            // No charset could be determined: decode as UTF-8 rather than fail.
            charset = StandardCharsets.UTF_8.name();
        }
        try {
            html = new String(content, charset);
        } catch (UnsupportedEncodingException e) {
            // The named charset is unknown to this JVM: fall back to UTF-8 and record
            // the charset that was actually used.
            charset = StandardCharsets.UTF_8.name();
            html = new String(content, StandardCharsets.UTF_8);
        }
        return html;
    }
}
